import pandas as pd
import numpy as np
import time
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler,MinMaxScaler # doctest: +SKIP
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.random_projection import SparseRandomProjection
from sklearn.random_projection import GaussianRandomProjection
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import learning_curve
from silhouette import silhouette
import matplotlib.cm as cm
from sklearn.decomposition import FastICA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from scipy.stats import norm, kurtosis
%matplotlib inline
# ---- Load homework-1 data and build a label-encoded copy of the raw census table ----
X = pd.read_pickle('../homework1/X.pkl')
y = pd.read_pickle('../homework1/y.pkl')
income = pd.read_csv('../homework1/adult.csv')
# Bucket rare native countries (<100 occurrences) together with '?' as 'other'.
country_counts = income['native.country'].value_counts()
country_counts_df = country_counts.to_frame(name='counts')
small = list(country_counts_df[country_counts_df.counts<100].index)
income_modified = income.copy()
income_modified.loc[income['native.country'].apply(lambda x:x in small+['?']),['native.country']] = 'other'
income_modified.loc[income_modified.workclass.apply(lambda x:x in ['?']),['occupation','workclass']] = ('other','other')
income_modified.drop(['fnlwgt','education','income'],axis=1,inplace=True)
# Label-encode every remaining object column, then standardize the whole frame.
le = preprocessing.LabelEncoder()
for item in income_modified.columns[income_modified.dtypes==object]:
income_modified[item] = le.fit_transform(income_modified[item])
income_modified_stan = StandardScaler().fit_transform(income_modified)
# NOTE(review): both sides of the '|' test capital.gain — the second was probably
# meant to be capital.loss (compare the similar filter further down); confirm.
pos = income_modified_stan[(income['capital.gain']>0)|(income['capital.gain']>0)]
X_stan = StandardScaler().fit_transform(X)
# Baseline k-means with 9 clusters on the standardized one-hot features.
kmeans = KMeans(n_clusters=9, random_state=0)
kmeans.fit(X_stan)
pd.DataFrame(kmeans.labels_,columns=['labels']).labels.value_counts()
pd.DataFrame(kmeans.labels_,columns=['labels']).labels.value_counts()
sum(kmeans.labels_)
kmeans.labels_.var()
kmeans.cluster_centers_[0].shape
X.shape
# ---- Compare EM (GaussianMixture) covariance structures at 9 components ----
from sklearn.mixture import GaussianMixture
co_type = ['full','diag','tied','spherical']
gm_co_scores = []
aics = []
for t in co_type:
gm = GaussianMixture(n_components=9, random_state=10,covariance_type=t)
gm_co = gm.fit_predict(X_stan)
gm_co_scores.append(silhouette_score(X_stan,gm_co))
aics.append(gm.aic(X_stan))
# AIC (lower is better) per covariance type; silhouette plot left commented out.
plt.title('EM Covariance Type Performance')
plt.xlabel('Covariance Type')
plt.ylabel('AIC Score')
plt.xticks(range(4),co_type)
#plt.plot(gm_co_scores)
plt.plot(aics)
gm_co_scores
# `gm` below is whatever model survived the loop (the 'spherical' one).
pd.DataFrame(gm.fit_predict(X_stan),columns=['label']).label.value_counts()
pd.DataFrame(kmeans.labels_,columns=['labels']).labels.value_counts()
X.head()
kmeans.labels_[[0,2]]
pd.DataFrame(kmeans.labels_[X['native.country_Canada']==1])[0].value_counts()
pd.DataFrame(kmeans.labels_[X['native.country_other']==1])[0].value_counts()
gm.bic(X_stan)
# ---- Sweep the number of EM components; record AIC and silhouette ----
# First ks list is immediately overridden by the shorter one below.
ks = [2,4,6,8,15,20,200,2000]
import time
d = X_stan
start = time.time()
ks = [2,4,6,8,10]
total = []
scores = []
for k in ks:
print(k)
gms = GaussianMixture(n_components=k, random_state=10)
gms.fit(d)
total.append(gms.aic(d))
# NOTE(review): fit_predict refits the model a second time per k — wasteful
# but deterministic given the fixed random_state.
cluster_labels = gms.fit_predict(d)
scores.append(silhouette_score(d, cluster_labels))
print(time.time()-start)
plt.xticks(range(0,5), ks)
plt.title('Choose Cluster for EM')
plt.ylabel('AIC')
plt.plot(total)
import numpy as np
from sklearn.manifold import TSNE
# ---- 2-D t-SNE embedding of the raw features for visualization ----
X_embedded = TSNE(n_components=2).fit_transform(X)
X_embedded.shape
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
n_neighbors=2
fig = plt.figure(figsize=(15, 8))
fig.suptitle("Manifold Learning with %i points, %i neighbors"
% (1000, n_neighbors), fontsize=14)
from sklearn import manifold, datasets
n_points = 1000
# S-curve sized to match the 32561-row adult dataset; only its color ramp is used
# to color the scatter plots below.
X1, color = datasets.make_s_curve(32561, random_state=0)
color
i=0
ax = fig.add_subplot(2, 5, 2 + i + (i > 3))
ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=color, cmap=plt.cm.Spectral)
#ax.set_title("%s (%.2g sec)" % (label, t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
ax.axis('tight')
X_embedded[:, 0]
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=color, cmap=plt.cm.Spectral)
X_embedded.shape
X_embedded[:10]
income = pd.read_csv('../homework1/adult.csv')
# NOTE(review): DataFrame has no attribute `na` — this cell raises AttributeError.
income.na
gm.get_params()
X_standard = X_stan.copy()
import matplotlib.cm as cm
# ---- Silhouette analysis per cluster count (adapted from the scikit-learn
# silhouette example): left panel = per-sample silhouette bars, right panel =
# scatter of the first two standardized features with cluster centers ----
range_n_clusters = [2, 4, 6,8]
for n_clusters in range_n_clusters:
# Create a subplot with 1 row and 2 columns
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)
# The 1st subplot is the silhouette plot
# The silhouette coefficient can range from -1, 1 but in this example all
# lie within [-0.1, 1]
ax1.set_xlim([-0.1, 1])
# The (n_clusters+1)*10 is for inserting blank space between silhouette
# plots of individual clusters, to demarcate them clearly.
ax1.set_ylim([0, len(X_stan) + (n_clusters + 1) * 10])
# Initialize the clusterer with n_clusters value and a random generator
# seed of 10 for reproducibility.
clusterer = KMeans(n_clusters=n_clusters, random_state=10)
cluster_labels = clusterer.fit_predict(X_stan)
# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed
# clusters
silhouette_avg = silhouette_score(X_stan, cluster_labels)
print("For n_clusters =", n_clusters,
"The average silhouette_score is :", silhouette_avg)
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(X_stan, cluster_labels)
y_lower = 10
for i in range(n_clusters):
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
ith_cluster_silhouette_values = \
sample_silhouette_values[cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cm.nipy_spectral(float(i) / n_clusters)
ax1.fill_betweenx(np.arange(y_lower, y_upper),
0, ith_cluster_silhouette_values,
facecolor=color, edgecolor=color, alpha=0.7)
# Label the silhouette plots with their cluster numbers at the middle
ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
# Compute the new y_lower for next plot
y_lower = y_upper + 10 # 10 for the 0 samples
ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")
# The vertical line for average silhouette score of all the values
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
ax1.set_yticks([]) # Clear the yaxis labels / ticks
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
# 2nd Plot showing the actual clusters formed
colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
ax2.scatter(X_stan[:, 0], X_stan[:, 1], marker='.', s=30, lw=0, alpha=0.7,
c=colors, edgecolor='k')
# Labeling the clusters
centers = clusterer.cluster_centers_
# Draw white circles at cluster centers
ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
c="white", alpha=1, s=200, edgecolor='k')
for i, c in enumerate(centers):
ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
s=50, edgecolor='k')
ax2.set_title("The visualization of the clustered data.")
ax2.set_xlabel("Feature space for the 1st feature")
ax2.set_ylabel("Feature space for the 2nd feature")
plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
"with n_clusters = %d" % n_clusters),
fontsize=14, fontweight='bold')
plt.show()
# `silhouette` is the project-local helper imported at the top of the file.
silhouette(X_stan,columns=[3,4])
silhouette(pos,columns=[3,4])
silhouette(income_modified_stan)
x_np = X.to_numpy()
# NOTE(review): '>' binds looser than '|', so this parses as
# ((X['capital.gain']>0) | X['capital.loss']) > 0; the intended filter was
# probably (X['capital.gain']>0)|(X['capital.loss']>0).
x_np = x_np[(X['capital.gain']>0)|(X['capital.loss'])>0]
silhouette(x_np,range_n_clusters=[2,4,6,8])#,columns=[3,4])
silhouette(x_np,range_n_clusters=[2,4,6,8],columns=[3,4])
income_modified_stan[0]
income_modified.head()
# ---- Large-k EM sweep on the label-encoded census data ----
import time
start = time.time()
ks = [20,200,500,1000,2000,2500]
total = []
for k in ks:
gms = GaussianMixture(n_components=k, random_state=10)
gms.fit(income_modified_stan)
total.append(gms.aic(income_modified_stan))
print(time.time()-start)
# NOTE(review): duplicated capital.gain condition again — likely meant capital.loss.
pos = income_modified_stan[(income['capital.gain']>0)|(income['capital.gain']>0)]
silhouette(income_modified_stan,range_n_clusters=[100,300,500],columns=[3,4])
income_modified.head()
colors = cm.nipy_spectral(1 / 8)
income_modified.iloc[:,2].shape
# ---- Final 6-cluster k-means vs 6-component EM, compared to the true labels ----
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans_label = kmeans.fit_predict(X_stan)
# NOTE(review): gm_label is first assigned a few lines below — this cell only
# works when the notebook cells were executed out of order.
max(gm_label)
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans_label = kmeans.fit_predict(X_stan)
gm = GaussianMixture(n_components=6, random_state=0)
gm_label = gm.fit_predict(X_stan)
gm = GaussianMixture(n_components=6, random_state=0)
gm_label = gm.fit_predict(X_stan)
from sklearn.metrics.cluster import v_measure_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
# External cluster-quality metrics against the income label y.
print(v_measure_score(gm_label,y),
v_measure_score(kmeans_label,y))
from sklearn.metrics.cluster import v_measure_score
print(v_measure_score(gm_label,y),
adjusted_mutual_info_score(kmeans_label,y))
from sklearn.metrics.cluster import v_measure_score
print(v_measure_score(gm_label,y),
adjusted_mutual_info_score(kmeans_label,y))
# NOTE(review): `aa` is built much later (reduction_clustering.pkl) — out-of-order cell.
print(v_measure_score(aa.ori_gm,y),v_measure_score(aa.ori_kmeans,y))
X.head()
results = pd.concat([income_modified,pd.DataFrame(kmeans_label,columns=['kmeans']),pd.DataFrame(gm_label,columns=['EM'])],
axis=1)
# rr: one row per sample with the true label and both cluster assignments.
rr = pd.DataFrame(np.stack((y.to_numpy(),kmeans_label,gm_label)).T,columns = ['label','kmeans','EM'])
rr.head()
# Per-cluster class counts for the EM assignment.
emdf = []
for i in range(6):
emdf.append(rr[rr.EM==i].label.value_counts())
emdf = pd.DataFrame(emdf,index=range(6)).T
emdf.index = ['EM0','EM1']
emdf
for i in range(6):
print(rr[rr.EM==i].label.value_counts())
# Same breakdown for the k-means assignment.
kmeans_df = []
for i in range(6):
kmeans_df.append(rr[rr.kmeans==i].label.value_counts())
kmeans_df = pd.DataFrame(kmeans_df,index=range(6)).T
kmeans_df.index = ['Kmeans0','Kmeans1']
kmeans_df
pd.concat([emdf,kmeans_df]).to_csv('raw.csv')
for i in range(6):
print(rr[rr.kmeans==i].label.value_counts())
# Cross-tabulate how each k-means cluster maps onto EM clusters.
rr[rr.kmeans==1].EM.value_counts()
rr[rr.kmeans==0].EM.value_counts()
rr[rr.kmeans==4].EM.value_counts()
income.loc[rr[rr.EM==5].index,]
#fig, (ax1, ax2) = plt.subplots(1, 2)
# ---- Paired bar charts of cluster sizes (GM vs k-means) per reduction method ----
# NOTE(review): depends on `aa`, which is only built near the end of the file —
# this cell ran out of order in the notebook. A copy of this loop appears later.
combined_sort_label = []
columns = aa.columns[:-1]
i = 0
for col in columns:
i+=1
bar_x = range(max(aa[col])+1)
bar_y = aa[col].value_counts().to_numpy()
combined_sort_label.append(bar_y)
plt.bar(bar_x, bar_y)
#plt.xticks(range(max(aa[col])+1),aa[col].value_counts().index)
# Columns come in (GM, kmeans) pairs: show one figure per pair.
if i==2:
i=0
plt.title(col[:4]+' GM and Kmeans')
plt.xlabel('cluster group')
plt.legend(('GM', 'Kmeans'))
plt.show()
X_stan.shape
# ---- PCA: inspect explained variance, then refit keeping 34 components ----
pca = PCA(random_state=0)
pca.fit(X_stan)
pca.explained_variance_ratio_
pca.explained_variance_ratio_[:-5].sum()
pca.explained_variance_ratio_[:34].sum()
len(pca.explained_variance_ratio_)
pca.explained_variance_ratio_.size
pca = PCA(n_components=34,random_state=0)
pca.fit(X_stan)
pca.explained_variance_ratio_#.sum()
import seaborn as sns
def plot_pca(vector_num, top_comp):
    """Bar-plot the largest-magnitude feature weights of one PCA component.

    Parameters
    ----------
    vector_num : int
        Index into the fitted module-level ``pca.components_``.
    top_comp : int
        How many features (ranked by absolute weight) to show.

    Uses the module-level ``pca`` (fitted PCA), ``X`` (for column names),
    seaborn and matplotlib; shows the figure as a side effect.
    """
    v_1 = pca.components_[vector_num]
    comps = pd.DataFrame(list(zip(v_1, X.columns.values)),
                         columns=['weights', 'features'])
    comps['abs_weights'] = comps['weights'].abs()
    sorted_weight_data = comps.sort_values('abs_weights', ascending=False).head(top_comp)
    # plt.subplots returns a (fig, ax) tuple; the original assigned that tuple
    # to `ax` and relied on the implicit current axes — draw explicitly instead.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=sorted_weight_data,
                x="weights",
                y="features",
                palette="Blues_d",
                ax=ax)
    ax.set_title("PCA Component Makeup, Component #" + str(vector_num))
    plt.show()
# Project the standardized data onto the 34 retained principal components.
pca_X = pca.transform(X_stan)
len(pca_X)
plot_pca(1,5)
plot_pca(2,5)
# ---- ICA: probe a few component counts, using kurtosis as the criterion ----
transformer = FastICA(n_components=7,
random_state=0)
X_transformed = transformer.fit_transform(X)
X_transformed.shape
from scipy.stats import norm, kurtosis
kurtosis(X_transformed)
len(transformer.components_[0])
kurtosis(X_transformed.flatten())
ks = [2,4,6,8,15,20,200,2000]
transformerlarge = FastICA(n_components=70,
random_state=0)
X_transformed_large = transformerlarge.fit_transform(X_stan)
X_transformed_large.shape
ks = [1,2,3,4,5,6,7,8,9,10]#,20,40]
def find_ica(ks, X_stan):
    """Fit FastICA at each component count in ``ks`` on ``X_stan``.

    Returns one per-component kurtosis array (from scipy.stats.kurtosis)
    per entry of ``ks``, in the same order.
    """
    return [
        kurtosis(FastICA(n_components=k, random_state=0).fit_transform(X_stan))
        for k in ks
    ]
ks = [2,3,5,10,40,50]
results = find_ica(ks,X_stan)
# Average kurtosis per component count, plotted to pick n_components for ICA.
rs = []
for r in results:
rs.append(r.mean())
plt.title('Number of Components for ICA ')
plt.xlabel('Number of Components')
plt.ylabel('Kurtosis')
plt.xticks(range(6),ks)
plt.plot(rs)
rs = []
for r in results:
rs.append(r.mean())
# Final ICA with 5 components; ica_X is the reduced view used downstream.
transformerlarge = FastICA(n_components=5,
random_state=0)
X_transformed_large = transformerlarge.fit_transform(X_stan)
len(transformerlarge.components_)
ica_X = transformerlarge.transform(X_stan)
import seaborn as sns
def plot_ica(vector_num, top_comp, transformerlarge=transformerlarge):
    """Bar-plot the largest-magnitude feature weights of one ICA component.

    Parameters
    ----------
    vector_num : int
        Index into ``transformerlarge.components_``.
    top_comp : int
        How many features (ranked by absolute weight) to show.
    transformerlarge : fitted FastICA, default: the module-level instance
        Bound at definition time, mirroring the original signature.

    Uses the module-level ``X`` for column names; shows the figure as a
    side effect.
    """
    v_1 = transformerlarge.components_[vector_num]
    comps = pd.DataFrame(list(zip(v_1, X.columns.values)),
                         columns=['weights', 'features'])
    comps['abs_weights'] = comps['weights'].abs()
    sorted_weight_data = comps.sort_values('abs_weights', ascending=False).head(top_comp)
    # plt.subplots returns a (fig, ax) tuple; the original assigned that tuple
    # to `ax` and relied on the implicit current axes — draw explicitly instead.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=sorted_weight_data,
                x="weights",
                y="features",
                palette="Blues_d",
                ax=ax)
    ax.set_title("ICA Component Makeup, Component #" + str(vector_num))
    plt.show()
plot_ica(0,5)
plot_ica(1,5)
# ---- Random projections: Gaussian vs Sparse, judged by reconstruction MSE ----
from sklearn.random_projection import SparseRandomProjection
from sklearn.random_projection import GaussianRandomProjection
rng = np.random.RandomState(42)
gauRP = GaussianRandomProjection(random_state=rng,n_components=5)
SRRp = SparseRandomProjection(random_state=rng,n_components=5)
gau = gauRP.fit_transform(X_stan)
sr = SRRp.fit_transform(X_stan)
transformer = SparseRandomProjection(random_state=rng)
rng = np.random.RandomState(42)
# "Reconstruct" by multiplying back through the projection matrix, then score.
np.dot(gau,gauRP.components_)
from sklearn.metrics import mean_squared_error
mean_squared_error(np.dot(gau,gauRP.components_), X_stan)
ks = [10,20,30,35,45,50,59]#,20,40]
#ks = [5]
def find_rand_proj(ks=ks, X_stan=X_stan):
    """Reconstruction MSE of SparseRandomProjection for each k in ``ks``.

    For each component count k, accumulates over five random seeds the MSE
    between ``X_stan`` and its back-projection (projected data multiplied by
    the densified projection matrix).

    Defaults bind the module-level ``ks`` / ``X_stan`` at definition time
    (kept for compatibility with the no-argument call site).

    NOTE(review): the accumulated error is divided by k (the component count),
    not by the 5 trials — preserved as-is so the plotted curve is unchanged.
    The original also built an unused GaussianRandomProjection and ran a
    redundant second fit_transform per trial; both removed (same seed ⇒ the
    discarded transform was identical, so results are unchanged).
    """
    results = []
    for k in ks:
        error = 0
        for seed in range(5):
            SRRp = SparseRandomProjection(random_state=seed, n_components=k)
            gau = SRRp.fit_transform(X_stan)
            # components_ is a sparse matrix for SparseRandomProjection;
            # densify before the matrix product.
            error += mean_squared_error(np.dot(gau, SRRp.components_.toarray()), X_stan)
        results.append(error / k)
    return results
# Runs the Sparse-RP version defined above (a Gaussian version is defined next,
# rebinding the same name).
randp_result = find_rand_proj()
ks = [10,20,30,35,45,50,59]#,20,40]
#ks = [5]
def find_rand_proj(ks=ks, X_stan=X_stan):
    """Reconstruction MSE of GaussianRandomProjection for each k in ``ks``.

    Rebinds the sparse version above with a Gaussian projection.  For each k,
    accumulates over five seeds the MSE between ``X_stan`` and the
    back-projection, then (matching the sparse version) divides by k rather
    than by the trial count.

    NOTE(review): the original ran fit_transform twice per trial with the same
    seed and discarded the second result; the redundant call is removed —
    the returned values are unchanged.
    """
    results = []
    for k in ks:
        error = 0
        for seed in range(5):
            gauRP = GaussianRandomProjection(random_state=seed, n_components=k)
            gau = gauRP.fit_transform(X_stan)
            error += mean_squared_error(np.dot(gau, gauRP.components_), X_stan)
        results.append(error / k)
    return results
# ---- Plot the random-projection MSE curves ----
plt.title('Choosing Number of Components for Randomized Projection')
plt.xlabel('number of clusters')
plt.ylabel('MSE')
plt.xticks(range(len(ks)),ks)
plt.plot(randp_result)
plt.title('Choosing Number of Components for SRR Randomized Projection')
plt.xlabel('number of clusters')
plt.ylabel('MSE')
plt.xticks(range(len(ks)),ks)
plt.plot(randp_result)
randp_result
randp_result
# Final random projection with 45 components; rp_X is the reduced view.
gauRP = GaussianRandomProjection(random_state=0,n_components=45)
rp_X = gauRP.fit_transform(X_stan)
# ---- Feature selection via random-forest importances ----
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_stan, y)
# NOTE(review): tree_importance_sorted_idx is defined on the next line —
# out-of-order cell execution.
X.columns[tree_importance_sorted_idx[:15]][:15]
tree_importance_sorted_idx = np.argsort(clf.feature_importances_)
tree_importance_sorted_idx
clf.feature_importances_[tree_importance_sorted_idx[:15]]
clf.feature_importances_[tree_importance_sorted_idx[-15:]].sum()
clf.feature_importances_[tree_importance_sorted_idx[-15:]].sum()
# result = permutation_importance(clf, X_stan, y, n_repeats=1,
# random_state=42)
# perm_sorted_idx = result.importances_mean.argsort()
tree_importance_sorted_idx = np.argsort(clf.feature_importances_)
tree_indices = np.arange(0, len(clf.feature_importances_)) + 0.5
#fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
# Top-5 importances as a horizontal bar chart.
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices[-5:],
clf.feature_importances_[tree_importance_sorted_idx[-5:]], height=0.7)
ax1.set_yticklabels(X.columns[tree_importance_sorted_idx][-5:])
ax1.set_yticks(tree_indices[-5:])
#ax1.set_ylim((0, len(clf.feature_importances_)))
# ax2.boxplot(result.importances[perm_sorted_idx].T, vert=False,
# labels=X.feature_names[perm_sorted_idx])
#fig.tight_layout()
plt.title('Tree Importance')
plt.show()
# Keep the 15 most important features as the "tree" reduced view.
tree_X = X_stan[:,tree_importance_sorted_idx[-15:]]
X.iloc[0:3,tree_importance_sorted_idx[-15:]]
# result = permutation_importance(clf, X_stan, y, n_repeats=1,
# random_state=42)
# perm_sorted_idx = result.importances_mean.argsort()
tree_importance_sorted_idx = np.argsort(clf.feature_importances_)
tree_indices = np.arange(0, len(clf.feature_importances_)) + 0.5
#fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
# Same chart with every feature included.
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
clf.feature_importances_[tree_importance_sorted_idx], height=0.7)
ax1.set_yticklabels(X.columns[tree_importance_sorted_idx])
ax1.set_yticks(tree_indices)
ax1.set_ylim((0, len(clf.feature_importances_)))
# ax2.boxplot(result.importances[perm_sorted_idx].T, vert=False,
# labels=X.feature_names[perm_sorted_idx])
#fig.tight_layout()
plt.title('Tree Importance')
plt.show()
# NOTE(review): bare `t2` — only assigned a few lines below; out-of-order cell.
t2
# ---- Wall-clock comparison of the three reduction transforms at 10 components ----
#start = time.time()
transformer = FastICA(n_components=10,
random_state=0)
pca = PCA(n_components=10,random_state=0)
gauRP = GaussianRandomProjection(random_state=0,n_components=10)
t0 = time.time()
gauRP.fit_transform(X_stan)
t1 = time.time()#-start
transformer.fit_transform(X_stan)
t2 = time.time()#-t1
pca.fit_transform(X_stan)
t3 = time.time()#-t2
print(t1-t0,t2-t1,t3-t2)
time.time()
ica_X.shape
pca_X.shape
rp_X.shape
tree_X.shape
# Concatenate the reduced views; the save to data.npy is commented out, so the
# load below expects a previously written cache file to exist.
data = np.concatenate([ica_X,pca_X,rp_X,tree_X],axis = 1)
# with open('data.npy', 'wb') as f:
# np.save(f, np.array([1, 2]))
# with open('data.npy', 'wb') as f:
# np.save(f, data)
with open('data.npy', 'rb') as f:
data = np.load(f)
ks = [2,4,6,8,15,20,100]
def find_gm_k(ks, datas):
    """AIC of a GaussianMixture at each k in ``ks``, for every dataset in ``datas``.

    Prints each k as it is fitted and the cumulative elapsed time after each
    dataset.  Returns a list (one entry per dataset) of per-k AIC lists.
    """
    start = time.time()

    def _aic(dataset, k):
        # GaussianMixture.fit returns self, so the chain fits then scores.
        print(k)
        return GaussianMixture(n_components=k, random_state=10).fit(dataset).aic(dataset)

    totals = []
    for dataset in datas:
        totals.append([_aic(dataset, k) for k in ks])
        print(time.time() - start)
    return totals
# Recover the individual reduced views from the cached concatenation
# (5 ICA + 34 PCA + 45 RP + 15 tree columns).
ica_X = data[:,:5]
pca_X = data[:,5:39]
rp_X = data[:,39:84]
tree_X = data[:,84:]
rp_X.shape
ks = [2,4,8,15,20,100]
ica_gm = find_gm_k(ks,[ica_X,pca_X,rp_X,tree_X,X_stan])
# NOTE(review): `aa` is defined later in the file — out-of-order cell.
rr = ica_gm[:-2]#.append(aa[0])
rr.append(aa[0])
rr
ks = [2,4,20,50,70,100]
ica_gm = find_gm_k(ks,[ica_X,pca_X,rp_X,tree_X,X_stan])
ica_gm[-1]
# One AIC column per view; persisted for re-plotting without refitting.
bb = pd.DataFrame(np.array(ica_gm).T,columns=['ica','pca','rp','tree','ori'])
bb.to_pickle('em_metric.pkl')
ax = bb.plot()
ax.set_title('AIC vs Number of Clusters')
ax.set_xticks(range(len(ks)),ks)
ax.set_xlabel('Number of Components')
ks = ['2','4','20','50','70','100']
bb.index = ks
ax = bb.plot()
ax.set_title('AIC vs Number of Clusters')
ax.set_xticks(range(len(ks)),ks)
ax.set_xlabel('Number of Components')
bb
bb.plot()
bb
bb = pd.read_pickle('em_metric.pkl')
len(ica_gm)
# ---- Per-method AIC curves in a 3x2 grid ----
fig, ((ax1, ax2),(ax3,ax4),(ax5,ax6)) = plt.subplots(3, 2, figsize=(12, 8))
# ax1.barh(tree_indices,
# clf.feature_importances_[tree_importance_sorted_idx], height=0.7)
# ax1.set_yticklabels(X.columns[tree_importance_sorted_idx])
# ax1.set_yticks(tree_indices)
# ax1.set_ylim((0, len(clf.feature_importances_)))
ks = [2,4,20,50,70,100]
ax1.plot(ks,ica_gm[0])
ax1.set_title('ICA')
ax2.plot(ks,ica_gm[1])
ax2.set_title('PCA')
ax3.plot(ks,ica_gm[2])
ax3.set_title('RP')
ax4.plot(ks,ica_gm[3])
ax4.set_title('Tree')
ax5.plot(ks,ica_gm[4])
ax5.set_title('Original')
ax6.axis('off')
plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25,
wspace=0.35)
plt.suptitle(("GaussianMixture AIC Score"),
fontsize=14, fontweight='bold')
fig, ((ax1, ax2),(ax3,ax4)) = plt.subplots(2, 2, figsize=(12, 8))
# ax1.barh(tree_indices,
# clf.feature_importances_[tree_importance_sorted_idx], height=0.7)
# ax1.set_yticklabels(X.columns[tree_importance_sorted_idx])
# ax1.set_yticks(tree_indices)
# ax1.set_ylim((0, len(clf.feature_importances_)))
ax1.plot(ica_gm[0])
ax2.plot(ica_gm[1])
ax3.plot(ica_gm[2])
ax4.plot(ica_gm[3])
# Silhouette sweeps (project-local helper) on each reduced view.
silhouette(ica_X)
silhouette(pca_X)
silhouette(tree_X)
silhouette(rp_X)
silhouette(X_stan)
from scipy.spatial.distance import cdist
K = [2,5,10,20,30,50,100]
def kmeans_elbow(K, X_stan, seed=0):
    """Fit KMeans for each cluster count in ``K`` and return elbow metrics.

    Parameters
    ----------
    K : iterable of int
        Cluster counts to evaluate.
    X_stan : ndarray
        Data matrix, rows are samples.
    seed : int, default 0
        ``random_state`` passed to each KMeans.

    Returns
    -------
    (silhouette_avg, distortions, inertias) : three parallel lists —
    mean silhouette score, mean euclidean distance to the closest centroid,
    and KMeans inertia, one entry per k.

    The original also carried two never-used dicts (mapping1/mapping2) and
    commented-out duplicates of the distortion/inertia lines; both removed.
    """
    distortions = []
    inertias = []
    silhouette_avg = []
    for k in K:
        # Building and fitting the model
        kmeanModel = KMeans(n_clusters=k, random_state=seed).fit(X_stan)
        # Distortion: average distance of each point to its nearest center.
        distortions.append(sum(np.min(cdist(X_stan, kmeanModel.cluster_centers_,
                                            'euclidean'), axis=1)) / X_stan.shape[0])
        inertias.append(kmeanModel.inertia_)
        # predict() on the training data re-scores every point; labels_ would
        # usually avoid the pass, but predict is kept to preserve behavior exactly.
        silhouette_avg.append(silhouette_score(X_stan, kmeanModel.predict(X_stan)))
    return silhouette_avg, distortions, inertias
# NOTE(review): kmeanModel is local to kmeans_elbow — this timing cell only ran
# in a session where the name leaked into globals; as written it raises NameError.
start = time.time()
silhouette_score(X_stan, kmeanModel.predict(X_stan))
time.time()-start
K = [10,40,70,80,90,100]
# NOTE(review): kmeans_elbow returns THREE values — every 2-name unpacking in
# this section (here and the ICA/RP/tree cells) raises ValueError.
distortions,inertias = kmeans_elbow(K,X_stan)
K = [2,4,6,8,10]
sih,distortions,inertias = kmeans_elbow(K,X_stan,seed = 10)
plt.plot(K[:-1], sih[:-1], 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Silhouette')
plt.title('The Elbow Method using Silhouette')
plt.show()
plt.plot(K, distortions, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Distortion')
plt.title('The Elbow Method using Distortion')
plt.show()
plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
plt.show()
# Same elbow plots for the PCA view.
K = [10,40,70,80,90,100]
#K = [2,4,6,10,20,50,100]
s,distortions,inertias = kmeans_elbow(K,pca_X)
plt.plot(K, distortions, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Distortion')
plt.title('PCA Elbow Method using Distortion')
plt.show()
plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('PCA Elbow Method using Inertia')
plt.show()
# ICA view.
K = [10,40,70,80,90,100]
K = [2,4,6,10,20,50,100]
distortions,inertias = kmeans_elbow(K,ica_X)
plt.plot(K, distortions, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Distortion')
plt.title('ICA Elbow Method using Distortion')
plt.show()
plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('ICA Elbow Method using Inertia')
plt.show()
# Random-projection view.
K = [10,40,70,80,90,100]
K = [2,4,6,10,20,50,100]
distortions,inertias = kmeans_elbow(K,rp_X)
plt.plot(K, distortions, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Distortion')
plt.title('RP Elbow Method using Distortion')
plt.show()
plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('RP Elbow Method using Inertia')
plt.show()
K = [10,40,70,80,90,100]
K = [2,4,6,8,9]#,20,50,100]
sih,distortions,inertias = kmeans_elbow(K,rp_X)
plt.plot(K, sih, 'bx-')
plt.xlabel('Values of K')
plt.title('Silhouette Score')
plt.show()
plt.plot(K, distortions, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Distortion')
plt.title('RP Elbow Method using Distortion')
plt.show()
plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('RP Elbow Method using Inertia')
plt.show()
# NOTE(review): cluster_labels is leftover from the silhouette-analysis loop above.
silhouette_avg = silhouette_score(X_stan, cluster_labels)
# Tree-selected-features view.
K = [10,40,70,80,90,100]
K = [2,4,6,10,20,50,100]
distortions,inertias = kmeans_elbow(K,tree_X)
plt.plot(K, distortions, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Distortion')
plt.title('Tree Elbow Method using Distortion')
plt.show()
plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('Tree Elbow Method using Inertia')
plt.show()
# Large-k silhouette sweeps on every view.
silhouette(X_stan,range_n_clusters=[50,80,100])
silhouette(ica_X,range_n_clusters=[50,80,100])
silhouette(pca_X,range_n_clusters=[50,80,100])
silhouette(rp_X,range_n_clusters=[50,80,100])
silhouette(tree_X,range_n_clusters=[50,80,100])
# NOTE(review): `X_` is undefined anywhere in the file — this cell raises NameError.
silhouette(X_,range_n_clusters=[50,80,100])
#gm = GaussianMixture(n_components=9, random_state=10)
kmeans = KMeans(n_clusters=9, random_state=0)
kmeans.fit(ica_X)
kmeans.labels_
# ---- Final clustering (GM + k-means) on each reduced view, using the chosen k ----
n_comp = [20,50,50,20,50]
combined_data = [ica_X,pca_X,rp_X,tree_X,X_stan]
# labels alternates GM then k-means per view: [ica_gm, ica_km, pca_gm, ...].
labels = []
for i in range(5):
gm = GaussianMixture(n_components=n_comp[i], random_state=0)
kmeans = KMeans(n_clusters=n_comp[i], random_state=0)
labels.append(gm.fit_predict(combined_data[i]))
labels.append(kmeans.fit_predict(combined_data[i]))
# Stability of the clusterings across 5 different random-projection seeds.
labels_rp = []
for i in range(5):
gauRP = GaussianRandomProjection(random_state=i,n_components=45)
multi_rp_X = gauRP.fit_transform(X_stan)
gm = GaussianMixture(n_components=50, random_state=0)
kmeans = KMeans(n_clusters=50, random_state=0)
labels_rp.append(gm.fit_predict(multi_rp_X))
labels_rp.append(kmeans.fit_predict(multi_rp_X))
multi_rp_label = pd.DataFrame(np.array(labels_rp).T)#.head()
order_by_count = []
for i in multi_rp_label.columns:
order_by_count.append(multi_rp_label[i].value_counts())
# Sizes of the 5 largest clusters across seeds (even columns = GM, odd = k-means).
ax = pd.DataFrame(np.array(order_by_count).T).iloc[:5,[0,2,3,6,8]].plot.bar(
title='EM cluster distribution for Random Projection')
ax.set_xlabel('Largest 5 Clusters')
ax = pd.DataFrame(np.array(order_by_count).T).iloc[:5,[1,3,5,7,9]].plot.bar(
title='Kmeans cluster distribution for Random Projection')
ax.set_xlabel('Largest 5 Clusters')
multi_rp_label[multi_rp_label[0]==47][1].value_counts()
multi_rp_label[multi_rp_label[0]==47][4].value_counts()
# Persist the 10 label vectors, then reload; `aa` becomes the analysis frame.
with open('reduction_clustering.npy', 'wb') as f:
np.save(f, labels)
with open('reduction_clustering.npy', 'rb') as f:
aa = np.load(f)
labels.append(y)
aa = pd.DataFrame(np.array(labels).T)
aa.to_pickle('reduction_clustering.pkl')
aa = pd.read_pickle('reduction_clustering.pkl')
aa.columns = ['ica_gm','ica_kmeans','pca_gm','pca_means','rp_gm','rp_kmeans','tree_gm','tree_kmeans','ori_gm','ori_kmeans','y']
aa.head()
columns = aa.columns[-1]
columns
#fig, (ax1, ax2) = plt.subplots(1, 2)
# Paired bar charts of cluster sizes, one figure per (GM, kmeans) column pair.
combined_sort_label = []
columns = aa.columns[:-1]
i = 0
for col in columns:
i+=1
bar_x = range(max(aa[col])+1)
bar_y = aa[col].value_counts().to_numpy()
combined_sort_label.append(bar_y)
plt.bar(bar_x, bar_y)
#plt.xticks(range(max(aa[col])+1),aa[col].value_counts().index)
if i==2:
i=0
plt.title(col[:4]+' GM and Kmeans')
plt.xlabel('cluster group')
plt.legend(('GM', 'Kmeans'))
plt.show()
aa[aa.ica_gm==i].y.value_counts().to_numpy()
# Class composition of each ICA-GM cluster; 'per' = ratio of the two class counts.
aaica=[]
for i in range(20):
aaica.append(aa[aa.ica_gm==i].y.value_counts().to_numpy())
# NOTE(review): df_aa is first built on the next line — out-of-order cell.
df_aa.sort_values(by='per',ascending=True)
df_aa = pd.DataFrame(aaica)
# NOTE(review): x[1] != 'NaN' compares against the STRING 'NaN', which is always
# True for numeric values — missing second-class counts are not actually guarded.
df_aa['per'] = df_aa.apply(lambda x:x[0]/x[1] if x[1] != 'NaN' else 0,axis=1)
df_aa.sort_values(by='per',ascending=False).to_csv('sortafter.csv')
aa.hist(figsize=(15,12))
# Silhouette score for each of the 10 stored clusterings against its own view.
start = time.time()
sil_score = []
cd = [ica_X,ica_X,pca_X,pca_X,rp_X,rp_X,tree_X,tree_X,X_stan,X_stan]
for i in range(10):
print(i,time.time()-start)
data = cd[i]
sil_score.append(silhouette_score(data,labels[i]))
sil_score
# Re-cluster a fresh 45-component projection with the last gm/kmeans objects.
gauRP = GaussianRandomProjection(random_state=10,n_components=45)
rp_X = gauRP.fit_transform(X_stan)
gm_rp = gm.fit_predict(rp_X)
km_rp = kmeans.fit_predict(rp_X)
print(silhouette_score(rp_X,gm_rp),silhouette_score(rp_X,km_rp))
gm.aic(rp_X)
gm.fit(pca_X)
gm.aic(pca_X)
gm_pca = gm.fit_predict(pca_X)
silhouette_score(pca_X,gm_pca)
print(silhouette_score(rp_X,gm_rp),silhouette_score(rp_X,km_rp))
rr = pd.DataFrame(np.stack((y.to_numpy(),kmeans_label,gm_label)).T,columns = ['label','kmeans','EM'])
rr.head()
aa.head()
for i in range(6):
print(aa[aa.ica_gm==i].y.value_counts())
for i in range(6):
print(aa[aa.ica_kmeans==i].y.value_counts())
rr[rr.kmeans==1].EM.value_counts()
rr[rr.kmeans==0].EM.value_counts()
rr[rr.kmeans==4].EM.value_counts()
# Learning-curve helper (adapted from the scikit-learn learning-curve example).
# Plots three panels: error vs training size, fit time vs training size, and
# error vs fit time.  Scores are converted to error via (1 - score) below.
# NOTE(review): because of that conversion, the legend labels "Training score" /
# "Cross-validation score" and the curves actually show ERROR, matching the
# "Error" y-label — the legend text is misleading but the plot is consistent.
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
# Build a 3-row figure when the caller did not supply axes.
if axes is None:
_, axes = plt.subplots(3, 1, figsize=(5, 20))
axes[0].set_title(title)
if ylim is not None:
axes[0].set_ylim(*ylim)
axes[0].set_xlabel("Training examples")
axes[0].set_ylabel("Error")
train_sizes, train_scores, test_scores, fit_times, _ = \
learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
train_sizes=train_sizes,
return_times=True)
# Convert accuracy-style scores into error (1 - score) before averaging.
train_scores_mean = np.mean(1-train_scores, axis=1)
train_scores_std = np.std(1-train_scores, axis=1)
test_scores_mean = np.mean(1-test_scores, axis=1)
test_scores_std = np.std(1-test_scores, axis=1)
fit_times_mean = np.mean(fit_times, axis=1)
fit_times_std = np.std(fit_times, axis=1)
# Plot learning curve
axes[0].grid()
axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="r")
axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1,
color="g")
axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
label="Cross-validation score")
axes[0].legend(loc="best")
# Plot n_samples vs fit_times
axes[1].grid()
axes[1].plot(train_sizes, fit_times_mean, 'o-')
axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
fit_times_mean + fit_times_std, alpha=0.1)
axes[1].set_xlabel("Training examples")
axes[1].set_ylabel("fit_times")
axes[1].set_title("Scalability of the model")
# Plot fit_time vs score
axes[2].grid()
axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1)
axes[2].set_xlabel("fit_times")
axes[2].set_ylabel("Error")
axes[2].set_title("Performance of the model")
# Returns the pyplot module so callers can chain .show().
return plt
import numpy as np
from sklearn.model_selection import train_test_split
# ---- Baseline MLP on the original standardized features, tuned by grid search ----
data = X_stan
X_train, X_test, y_train, y_test = train_test_split(
data, y, test_size=0.2, random_state=42)
from sklearn.model_selection import GridSearchCV
start = time.time()
param_grid = [
{'alpha': [0.1,10,100],'hidden_layer_sizes':[(5,),(5,4),(20,),(20,4)]}
]
mlp = MLPClassifier( random_state=1)
search = GridSearchCV(mlp, param_grid, cv=5)
search.fit(X_train,y_train)
time.time()-start
search.best_params_
search.best_params_['alpha']
# Refit with the chosen hyperparameters hard-coded (grid results commented inline).
mlp = MLPClassifier( alpha=0.1,#search.best_params_['alpha'],
hidden_layer_sizes=(20,),#search.best_params_['hidden_layer_sizes'],
random_state=1)
mlp.fit(X_train,y_train)
mlp.score(X_test,y_test)
# Train an MLP on a (possibly reduced) feature matrix df, reusing the global y.
# grid=True: grid-search alpha/hidden_layer_sizes first; grid=False: use `params`.
# Returns (fitted mlp, test accuracy, t).
# NOTE(review): the default `params` dict is a mutable default argument (shared
# across calls); harmless here since it is only read, never mutated.
# NOTE(review): when grid=False, `t` is never updated, so the returned t is an
# absolute timestamp rather than an elapsed time — callers should ignore it.
# NOTE(review): `search.best_params_ = params` sets the attribute on an UNFITTED
# GridSearchCV purely as a container for the two hyperparameters read below.
def nn_with_reduction(df,params={'alpha': 0.1, 'hidden_layer_sizes': (20,)}
,grid=True):
start = time.time()
X_train, X_test, y_train, y_test = train_test_split(
df, y, test_size=0.2, random_state=42)
param_grid = [{'alpha': [0.01,0.1,10],'hidden_layer_sizes':[(20,)]#(5,),(5,4),(20,),(20,4)]
}]
mlp = MLPClassifier( random_state=1)
search = GridSearchCV(mlp, param_grid, cv=5)
t=time.time()
if grid:
search.fit(X_train,y_train)
t = time.time()-t
print(search.best_params_)
else:
search.best_params_ = params
mlp = MLPClassifier( alpha=search.best_params_['alpha'],
hidden_layer_sizes=search.best_params_['hidden_layer_sizes'],
random_state=1)
mlp.fit(X_train,y_train)
print(time.time()-start)
return mlp,mlp.score(X_test,y_test),t
rp_m,s = nn_with_reduction(rp_X)
s
rp_m,s = nn_with_reduction(rp_X,False,{'hidden_layer_sizes': (40,)})
s
nn_with_reduction(ica_X)
X_stan[:,:10].shape
ica_X.shape
complete_ica = np.concatenate([X_stan[:,:10],ica_X],axis=1)
mlp_ica,s = nn_with_reduction(complete_ica,{'alpha':0.001,'hidden_layer_sizes': (50,4)},False)
s
mlp_ori,s = nn_with_reduction(X_stan,{'alpha':0.1,'hidden_layer_sizes': (5,4)},False)
s
nn_with_reduction(pca_X)
nn_with_reduction(X_stan)
nn_with_reduction(tree_X)
mlp_tree,s = nn_with_reduction(tree_X,{'alpha':0.1,'hidden_layer_sizes': (5,4)},False)
s
mlp_tree,s = nn_with_reduction(tree_X,{'alpha':0.1,'hidden_layer_sizes': (20,)},False)
s
training_time = []
xlabel = ['ica','tree','pca','ori']
cd =[ica_X,tree_X,pca_X,X_stan]
for d in cd:
start = time.time()
mlp.fit(d,y)
training_time.append(time.time()-start)
plt.title('Training Time vs Feature Reduction')
plt.xlabel('Feature Reduction Method')
plt.ylabel('Time in Seconds')
plt.xticks(range(4),xlabel)
plt.plot(training_time)
training_time
start = time.time()
mlp.fit(ica_X,y)
training_time.append(time.time()-start)
rp
# Confusion matrix and per-class report for the tree-feature MLP on its
# held-out split (same split seed as training, so the test rows match).
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score

data = tree_X
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42)
y_pred = mlp_tree.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(cm).plot()
print(classification_report(y_test, y_pred))
# Same evaluation for the MLP trained on the full standardized features.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

data = X_stan
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42)
y_pred = mlp_ori.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(cm).plot()
print(classification_report(y_test, y_pred))
# Inspect the cluster-label DataFrame `aa` (presumably one column of
# cluster assignments per reduction method — verify against earlier cells).
aa
col_names = ['ica', 'pca', 'rp', 'tree']
# NOTE(review): this loop is a no-op — `df` is overwritten each iteration
# and only the last column survives; `col_names` is never used.
for col in aa.columns:
    df = aa[col]
# Stringify the cluster labels so get_dummies one-hot encodes them, then
# standardize for use as extra NN input features.
ori_gm = aa.ori_gm.apply(lambda x: str(x))
ori_kmeans = aa.ori_kmeans.apply(lambda x: str(x))
cluster_features = StandardScaler().fit_transform(pd.get_dummies(ori_gm))
cluster_features.shape
gm_label
kmeans_label
combined_data[0].shape
# Q5: append the one-hot cluster features to every reduced dataset, then
# retrain the NN (with grid search) on each, recording model/score/time.
q5_data = [np.concatenate([d, cluster_features], axis=1) for d in combined_data]
# The first entry uses the ICA-plus-original combination instead.
q5_data[0] = np.concatenate([complete_ica, cluster_features], axis=1)
for q in q5_data:
    print(q.shape)
q5model = []
q5score = []
q5time = []
for q in q5_data[:-1]:
    model, s, t = nn_with_reduction(q)
    q5model.append(model)
    q5score.append(s)
    q5time.append(t)
q5score
q5_data[0] = np.concatenate([complete_ica, cluster_features], axis=1)
q5_data[0].shape
plt.plot(q5score)
# Train the last dataset separately with fixed hyperparameters.
mlp_tree, s, t = nn_with_reduction(q5_data[-1], {'alpha': 0.01, 'hidden_layer_sizes': (20,)}, False)
s
# Evaluate the model trained on the last Q5 dataset.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

data = q5_data[-1]
model = mlp_tree
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42)
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(cm).plot()
print(classification_report(y_test, y_pred))
# Train and evaluate on the second-to-last Q5 dataset (random projection
# plus cluster features).
mlp_rp_5, s, t = nn_with_reduction(q5_data[-2], {'alpha': 0.01, 'hidden_layer_sizes': (20,)}, False)
s
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

data = q5_data[-2]
model = mlp_rp_5
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42)
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(cm).plot()
print(classification_report(y_test, y_pred))
# Q4: retrain the NN (grid search) on each reduced dataset WITHOUT the
# cluster features, to compare against the Q5 scores.
q4model, q4score, q4time = [], [], []
q4_data = [complete_ica, ica_X, pca_X, tree_X]
for q in q4_data:
    model, s, t = nn_with_reduction(q)
    q4model.append(model)
    q4score.append(s)
    q4time.append(t)
q4score
q5score
# Re-run of the Q5 pipeline (duplicates an earlier cell, minus the shape
# prints and the plot).
cluster_features = StandardScaler().fit_transform(pd.get_dummies(ori_gm))
cluster_features.shape
q5_data = [np.concatenate([d, cluster_features], axis=1) for d in combined_data]
q5_data[0] = np.concatenate([complete_ica, cluster_features], axis=1)
q5model, q5score, q5time = [], [], []
for q in q5_data[:-1]:
    model, s, t = nn_with_reduction(q)
    q5model.append(model)
    q5score.append(s)
    q5time.append(t)
q5score